// SPDX-License-Identifier: BSD-2-Clause
/* Copyright (C) 2014 - 2021 Intel Corporation. */

#include "framework.hpp"
#include <algorithm> // sort
#include <cassert>
#include <iostream>
#include <math.h> // log2

namespace performance_tests
{
namespace ch = std::chrono;
using std::cout;
using std::endl;
using std::mutex;
using std::unique_lock;

#ifdef __DEBUG
mutex g_coutMutex;
int g_msgLevel = 1;
#endif

void Barrier::wait()
{
    unique_lock<mutex> lock(m_barrierMutex);
    // Decrement number of threads awaited at the barrier
    m_waiting--;
    if (m_waiting == 0) {
        // Called by the last expected thread - notify all waiting threads and
        // exit
        m_cVar.notify_all();
        // Store the time when barrier was released
        if (m_releasedAt.tv_sec == 0 && m_releasedAt.tv_nsec == 0) {
            clock_gettime(CLOCK_MONOTONIC, &m_releasedAt);
        }
        return;
    }
    // Wait unitl the last expected thread calls wait() on Barrier instance, or
    // timeout occurs
    m_cVar.wait_until(lock, ch::system_clock::now() + ch::seconds(10),
                      []() { return GetInstance().m_waiting == 0; });
}

// Worker class
Worker::Worker(uint32_t actionsCount, const vector<size_t> &allocationSizes,
               Operation *freeOperation, memkind_t kind)
    : m_actionsCount(actionsCount),
      m_allocationSizes(allocationSizes),
      m_actions(vector<Action *>(actionsCount, nullptr)),
      m_kind(kind)
{
    assert(freeOperation->getName() == OperationName::Free);
}

Worker::~Worker()
{
    for (Action *action : m_actions) { // each action
        delete action;
    }
}

void Worker::init(const vector<Operation *> &testOperations,
                  Operation *&freeOperation)
{
    for (uint32_t i = 0; i < m_actionsCount; i++) {
        int bucketSize = rand() % Operation::MaxBucketSize;

        for (Operation *operation : testOperations) { // each operation
            if (operation->checkCondition(bucketSize)) {
                size_t size =
                    m_allocationSizes[m_allocationSizes.size() > 1
                                          ? rand() % m_allocationSizes.size()
                                          : 0];
                m_actions[i] = new Action(
                    operation, freeOperation, m_kind, size, log2(rand() % size),
                    sizeof(void *) *
                        (1 << ((rand() % Operation::MemalignMaxMultiplier))));
                break;
            }
        }
    }
}

void Worker::run()
{
    m_thread = new thread(&Worker::work, this);
}

#ifdef __DEBUG
uint16_t Worker::getId()
{
    return m_threadId;
}
void Worker::setId(uint16_t threadId)
{
    m_threadId = threadId;
}
#endif

void Worker::finish()
{
    if (m_thread != nullptr) {
        m_thread->join();
        delete m_thread;
    }
}

void Worker::work()
{
    EMIT(1, "Entering barrier " << m_threadId)
    Barrier::GetInstance().wait();
    EMIT(1, "Starting thread " << m_threadId)
    for (Action *action : m_actions) {
        action->alloc();
    }
}

void Worker::clean()
{
    EMIT(2, "Cleaning thread " << m_threadId)
    for (Action *action : m_actions) {
        action->free();
    }
    EMIT(1, "Thread " << m_threadId << " finished")
}

// PerformanceTest class
PerformanceTest::PerformanceTest(size_t repeatsCount, size_t threadsCount,
                                 size_t operationsCount)
    : m_repeatsCount(repeatsCount),
      m_discardCount(repeatsCount * (distardPercent / 100.0)),
      m_threadsCount(threadsCount),
      m_operationsCount(operationsCount),
      m_executionMode(ExecutionMode::SingleInteration)
{}

void PerformanceTest::setAllocationSizes(const vector<size_t> &allocationSizes)
{
    m_allocationSizes = allocationSizes;
}

void PerformanceTest::setOperations(
    const vector<vector<Operation *>> &testOperations, Operation *freeOperation)
{
    m_testOperations = testOperations;
    m_freeOperation = freeOperation;
}

void PerformanceTest::setExecutionMode(ExecutionMode executionMode)
{
    m_executionMode = executionMode;
}

void PerformanceTest::setKind(const vector<memkind_t> &kinds)
{
    m_kinds = kinds;
}

inline void PerformanceTest::runIteration()
{
    timespec iterationStop, iterationStart;

    Barrier::GetInstance().reset(m_threadsCount);
    for (Worker *worker : m_workers) {
        worker->run();
    }
    for (Worker *worker : m_workers) {
        worker->finish();
    }
    EMIT(1, "Alloc completed");
    clock_gettime(CLOCK_MONOTONIC, &iterationStop);
    iterationStart = Barrier::GetInstance().releasedAt();
    m_durations.push_back(
        (iterationStop.tv_sec * NanoSecInSec + iterationStop.tv_nsec) -
        (iterationStart.tv_sec * NanoSecInSec + iterationStart.tv_nsec));
    for (Worker *worker : m_workers) {
        worker->clean();
    }
}

void PerformanceTest::prepareWorkers()
{
    for (size_t threadId = 0; threadId < m_threadsCount; threadId++) {
        m_workers.push_back(new Worker(
            m_operationsCount, m_allocationSizes, m_freeOperation,
            m_kinds.size() > 0 ? m_kinds[threadId % m_kinds.size()] : nullptr));
#ifdef __DEBUG
        m_workers.back()->setId(threadId);
#endif
        if (m_executionMode == ExecutionMode::SingleInteration) {
            // In ManyIterations mode, operations will be set for each thread at
            // the beginning of each iteration
            m_workers.back()->init(
                m_testOperations[threadId % m_testOperations.size()],
                m_freeOperation);
        }
    }
}

Metrics PerformanceTest::getMetrics()
{
    uint64_t totalDuration = 0;

    std::sort(m_durations.begin(), m_durations.end());

    m_durations.erase(m_durations.end() - m_discardCount, m_durations.end());
    for (uint64_t &duration : m_durations) {
        totalDuration += duration;
    }

    Metrics metrics;

    metrics.executedOperations =
        m_durations.size() * m_threadsCount * m_operationsCount;
    metrics.totalDuration = totalDuration;
    metrics.repeatDuration =
        (double)totalDuration / ((uint64_t)m_durations.size() * NanoSecInSec);
    metrics.iterationDuration = metrics.repeatDuration;
    if (m_executionMode == ExecutionMode::ManyIterations) {
        metrics.executedOperations *= m_testOperations.size();
        metrics.iterationDuration /= m_testOperations.size();
    }
    metrics.operationsPerSecond =
        (double)metrics.executedOperations * NanoSecInSec / totalDuration;
    metrics.avgOperationDuration =
        (double)totalDuration / metrics.executedOperations;
    assert(metrics.iterationDuration != 0.0);
    return metrics;
}

void PerformanceTest::writeMetrics(const string &suiteName,
                                   const string &caseName,
                                   const string &fileName)
{
    Metrics metrics = getMetrics();

    // For thousands separation
    setlocale(LC_ALL, "");
    if (!fileName.empty()) {
        FILE *f;
        if ((f = fopen(fileName.c_str(), "a+"))) {
            fprintf(f, "%s;%s;%zu;%zu;%lu;%f;%f;%f;%f\n", suiteName.c_str(),
                    caseName.c_str(), m_repeatsCount, m_threadsCount,
                    metrics.executedOperations, metrics.operationsPerSecond,
                    metrics.avgOperationDuration, metrics.iterationDuration,
                    metrics.repeatDuration);
            fclose(f);
        }
    }
    printf("Operations/sec:\t\t\t%'f\n"
           "Avg. operation duration:\t%f nsec\n"
           "Iteration duration:\t\t%f sec\n"
           "Repeat duration:\t\t%f sec\n",
           metrics.operationsPerSecond, metrics.avgOperationDuration,
           metrics.iterationDuration, metrics.repeatDuration);
}

int PerformanceTest::run()
{
    if (m_testOperations.empty() || m_allocationSizes.empty() ||
        m_freeOperation == nullptr) {
        cout << "ERROR: Test not initialized" << endl;
        return 1;
    }
    // Create threads
    prepareWorkers();
    // warmup kinds
    void *alloc = nullptr;

    for (const memkind_t &kind : m_kinds) {
        m_testOperations[0][0]->perform(kind, alloc, 1e6);
        m_freeOperation->perform(kind, alloc);
    }
    for (size_t repeat = 0; repeat < m_repeatsCount; repeat++) {
        EMIT(1, "Test run #" << repeat)
        if (m_executionMode == ExecutionMode::SingleInteration) {
            runIteration();
        } else {
            // Perform each operations list in separate iteration, for each
            // thread
            for (vector<Operation *> &ops : m_testOperations) {
                for (Worker *worker : m_workers) {
                    worker->init(ops, m_freeOperation);
                }
                runIteration();
            }
        }
    }
    return 0;
}

void PerformanceTest::showInfo()
{
    printf(
        "Test parameters: %lu repeats, %lu threads, %d operations per thread\n",
        m_repeatsCount, m_threadsCount, m_operationsCount);
    printf("Thread memory allocation operations:\n");
    for (unsigned long i = 0; i < m_testOperations.size(); i++) {
        if (m_executionMode == ExecutionMode::SingleInteration) {
            printf("\tThread %lu,%lu,...\n", i, i + (m_testOperations.size()));
        } else {
            printf("\tIteration %lu\n", i);
        }
        for (const Operation *op : m_testOperations[i]) {
            printf("\t\t %s (bucket size: %d)\n", op->getNameStr().c_str(),
                   op->getBucketSize());
        }
    }
    printf("Memory free operation:\n\t\t%s\n",
           m_freeOperation->getNameStr().c_str());
    printf("Allocation sizes:\n");
    for (size_t size : m_allocationSizes) {
        printf("\t\t%lu bytes\n", size);
    }
}
} // namespace performance_tests
